{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "afd55886-5f5b-4794-838e-ef8179fb0394",
   "metadata": {},
   "source": [
    "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. Example for transform developers working from git clone:\n",
    "```\n",
    "make venv \n",
    "source venv/bin/activate \n",
    "pip install jupyterlab\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%capture\n",
    "## This is here as a reference only\n",
    "# Users and application developers must use the right tag for the latest from pypi\n",
    "%pip install data-prep-toolkit-transforms[similarity]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ebf1f782-0e61-485c-8670-81066beb734c",
   "metadata": {},
   "source": [
    "##### ***** Import required classes and modules"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c2a12abc-9460-4e45-8961-873b48a9ab19",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "\n",
    "from data_processing.utils import ParamsUtils\n",
    "from data_processing.runtime.pure_python import PythonTransformLauncher\n",
    "from dpk_similarity.transform_python import SimilarityPythonTransformConfiguration"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7234563c-2924-4150-8a31-4aec98c1bf33",
   "metadata": {},
   "source": [
    "##### ***** Setup runtime parameters for this transform"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e90a853e-412f-45d7-af3d-959e755aeebb",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# create parameters\n",
    "input_folder = os.path.join(\"test-data\", \"input\")\n",
    "output_folder = os.path.join( \"output\")\n",
    "local_conf = {\n",
    "    \"input_folder\": input_folder,\n",
    "    \"output_folder\": output_folder,\n",
    "}\n",
    "\n",
    "params = {\n",
    "    \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n",
    "    \"runtime_pipeline_id\": \"pipeline_id\",\n",
    "    \"runtime_job_id\": \"job_id\",\n",
    "    \"similarity_annotation_column\": \"similarity_score\",\n",
    "    \"similarity_doc_text_column\": \"contents\"\n",
    "}\n",
    "\n",
    "## Will used test data if environment variable not defined\n",
    "if os.environ.get('SIMILARITY_ES_ENDPOINT'):\n",
    "    similarity_params = {\n",
    "        \"similarity_es_endpoint\": os.environ.get('SIMILARITY_ES_ENDPOINT'),\n",
    "        \"similarity_es_userid\": os.environ.get('SIMILARITY_ES_USERID'),\n",
    "        \"similarity_es_pwd\": os.environ.get('SIMILARITY_ES_PWD'),\n",
    "        \"similarity_es_index\": \"cnn_dailymail\",\n",
    "    }\n",
    "else:\n",
    "    similarity_params = {\n",
    "        \"similarity_es_endpoint\": None,\n",
    "        \"similarity_es_index\": \"mydata\"\n",
    "    }\n",
    "    \n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a",
   "metadata": {},
   "source": [
    "##### ***** Use python runtime to invoke the transform"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0775e400-7469-49a6-8998-bd4772931459",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%capture\n",
    "# Set the simulated command line args\n",
    "sys.argv = ParamsUtils.dict_to_req(d=params | similarity_params)\n",
    "# create launcher\n",
    "launcher = PythonTransformLauncher(runtime_config=SimilarityPythonTransformConfiguration())\n",
    "launcher.launch()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c3df5adf-4717-4a03-864d-9151cd3f134b",
   "metadata": {},
   "source": [
    "##### **** The specified folder will include the transformed parquet files."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7276fe84-6512-4605-ab65-747351e13a7c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import glob\n",
    "glob.glob(\"output/*\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "845a75cf-f4a9-467d-87fa-ccbac1c9beb8",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
